{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a999e152",
   "metadata": {},
   "source": [
    "### P025 预估目标字段编码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "db0a1c3b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a67a84dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = {\n",
    "    'size': ['XL', 'L', 'M', 'L', 'M'],\n",
    "    'color': ['red', 'green', 'blue', 'green', 'red'],\n",
    "    'gender': ['female', 'male', 'male', 'female', 'female'],\n",
    "    'price': [199.0, 89.0, 99.0, 129.0, 79.0],\n",
    "    'weight': [500, 450, 300, 380, 410],\n",
    "    'bought': ['yes', 'no', 'yes', 'no', 'yes']\n",
    "}\n",
    " \n",
    "df = pd.DataFrame(data=data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "aa854603",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>size</th>\n",
       "      <th>color</th>\n",
       "      <th>gender</th>\n",
       "      <th>price</th>\n",
       "      <th>weight</th>\n",
       "      <th>bought</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>XL</td>\n",
       "      <td>red</td>\n",
       "      <td>female</td>\n",
       "      <td>199.0</td>\n",
       "      <td>500</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>L</td>\n",
       "      <td>green</td>\n",
       "      <td>male</td>\n",
       "      <td>89.0</td>\n",
       "      <td>450</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>M</td>\n",
       "      <td>blue</td>\n",
       "      <td>male</td>\n",
       "      <td>99.0</td>\n",
       "      <td>300</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>L</td>\n",
       "      <td>green</td>\n",
       "      <td>female</td>\n",
       "      <td>129.0</td>\n",
       "      <td>380</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>M</td>\n",
       "      <td>red</td>\n",
       "      <td>female</td>\n",
       "      <td>79.0</td>\n",
       "      <td>410</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  size  color  gender  price  weight bought\n",
       "0   XL    red  female  199.0     500    yes\n",
       "1    L  green    male   89.0     450     no\n",
       "2    M   blue    male   99.0     300    yes\n",
       "3    L  green  female  129.0     380     no\n",
       "4    M    red  female   79.0     410    yes"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "720a53a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "d51f7456",
   "metadata": {},
   "outputs": [],
   "source": [
    "labelEncoder = LabelEncoder()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c772b66d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit on the untouched raw labels in `data` rather than on df[\"bought\"]:\n",
    "# this cell overwrites that column, so fitting on it would make a re-run\n",
    "# learn the classes 0/1 and lose the original 'no'/'yes' mapping that\n",
    "# classes_ and inverse_transform rely on further down.\n",
    "df[\"bought\"] = labelEncoder.fit_transform(pd.Series(data[\"bought\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "8866197d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>size</th>\n",
       "      <th>color</th>\n",
       "      <th>gender</th>\n",
       "      <th>price</th>\n",
       "      <th>weight</th>\n",
       "      <th>bought</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>XL</td>\n",
       "      <td>red</td>\n",
       "      <td>female</td>\n",
       "      <td>199.0</td>\n",
       "      <td>500</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>L</td>\n",
       "      <td>green</td>\n",
       "      <td>male</td>\n",
       "      <td>89.0</td>\n",
       "      <td>450</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>M</td>\n",
       "      <td>blue</td>\n",
       "      <td>male</td>\n",
       "      <td>99.0</td>\n",
       "      <td>300</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>L</td>\n",
       "      <td>green</td>\n",
       "      <td>female</td>\n",
       "      <td>129.0</td>\n",
       "      <td>380</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>M</td>\n",
       "      <td>red</td>\n",
       "      <td>female</td>\n",
       "      <td>79.0</td>\n",
       "      <td>410</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  size  color  gender  price  weight  bought\n",
       "0   XL    red  female  199.0     500       1\n",
       "1    L  green    male   89.0     450       0\n",
       "2    M   blue    male   99.0     300       1\n",
       "3    L  green  female  129.0     380       0\n",
       "4    M    red  female   79.0     410       1"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "6e6ef77e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['no', 'yes'], dtype=object)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "labelEncoder.classes_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "bb63a5b3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['yes', 'no', 'yes', 'no', 'yes'], dtype=object)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "labelEncoder.inverse_transform(df[\"bought\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dfd7aacc",
   "metadata": {},
   "source": [
    "### P026 分类特征类型编码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "1d0e159d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>size</th>\n",
       "      <th>color</th>\n",
       "      <th>gender</th>\n",
       "      <th>price</th>\n",
       "      <th>weight</th>\n",
       "      <th>bought</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>XL</td>\n",
       "      <td>red</td>\n",
       "      <td>female</td>\n",
       "      <td>199.0</td>\n",
       "      <td>500</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>L</td>\n",
       "      <td>green</td>\n",
       "      <td>male</td>\n",
       "      <td>89.0</td>\n",
       "      <td>450</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>M</td>\n",
       "      <td>blue</td>\n",
       "      <td>male</td>\n",
       "      <td>99.0</td>\n",
       "      <td>300</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>L</td>\n",
       "      <td>green</td>\n",
       "      <td>female</td>\n",
       "      <td>129.0</td>\n",
       "      <td>380</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>M</td>\n",
       "      <td>red</td>\n",
       "      <td>female</td>\n",
       "      <td>79.0</td>\n",
       "      <td>410</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  size  color  gender  price  weight  bought\n",
       "0   XL    red  female  199.0     500       1\n",
       "1    L  green    male   89.0     450       0\n",
       "2    M   blue    male   99.0     300       1\n",
       "3    L  green  female  129.0     380       0\n",
       "4    M    red  female   79.0     410       1"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "eeeb8d70",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 5 entries, 0 to 4\n",
      "Data columns (total 6 columns):\n",
      " #   Column  Non-Null Count  Dtype  \n",
      "---  ------  --------------  -----  \n",
      " 0   size    5 non-null      object \n",
      " 1   color   5 non-null      object \n",
      " 2   gender  5 non-null      object \n",
      " 3   price   5 non-null      float64\n",
      " 4   weight  5 non-null      int64  \n",
      " 5   bought  5 non-null      int32  \n",
      "dtypes: float64(1), int32(1), int64(1), object(3)\n",
      "memory usage: 348.0+ bytes\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "2c847b4f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import OneHotEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "94baa845",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2\n",
    "# and removed in 1.4. Try the current name first and fall back to the old\n",
    "# one so the notebook runs on both new and old releases; either way the\n",
    "# encoder returns a dense ndarray instead of a scipy sparse matrix.\n",
    "try:\n",
    "    oneHotEncoder = OneHotEncoder(sparse_output=False)\n",
    "except TypeError:\n",
    "    oneHotEncoder = OneHotEncoder(sparse=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "b522b864",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "OneHotEncoder(sparse=False)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "oneHotEncoder.fit(df[[\"size\"]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "512f87b7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0., 0., 1.],\n",
       "       [1., 0., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [0., 1., 0.]])"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "oneHotEncoder.transform(df[[\"size\"]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "72485ff8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array(['L', 'M', 'XL'], dtype=object)]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "oneHotEncoder.categories_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "279842ea",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
